library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(RMySQL)
## Loading required package: DBI
library(knitr)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(corrr)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✓ broom 0.7.4 ✓ recipes 0.1.15
## ✓ dials 0.0.9 ✓ rsample 0.0.9
## ✓ infer 0.5.4 ✓ tune 0.1.3
## ✓ modeldata 0.1.0 ✓ workflows 0.2.2
## ✓ parsnip 0.1.5 ✓ yardstick 0.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x gridExtra::combine() masks dplyr::combine()
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x maps::map() masks purrr::map()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
library(kknn)
# Keep full numeric notation in output (avoid scientific notation).
options(scipen = 999)

# SECURITY NOTE(review): these credentials are hard-coded and exposed in
# this document. Rotate the password and load credentials from the
# environment instead, e.g. user = Sys.getenv("OFX_USER"),
# password = Sys.getenv("OFX_PASSWORD").
db <- dbConnect(MySQL(),
                user = 'ofx_user',
                password = 'TestyTester#2021',
                dbname = 'OFX',
                host = 'ballenger.wlu.edu')
knitr::opts_knit$set(sql.max.print = -1)

# dbGetQuery() runs each query, fetches every row, and frees the result
# set. This replaces the dbSendQuery()/fetch() pairs, which left six
# result sets open (never dbClearResult()-ed).
buyer_ofx_table         <- dbGetQuery(db, 'SELECT * FROM buyer')
category_ofx_table      <- dbGetQuery(db, 'SELECT * FROM category')
location_ofx_table      <- dbGetQuery(db, 'SELECT * FROM location')
order_product_ofx_table <- dbGetQuery(db, 'SELECT * FROM order_product')
orders_ofx_table        <- dbGetQuery(db, 'SELECT * FROM orders')
product_ofx_table       <- dbGetQuery(db, 'SELECT * FROM product')

# Snapshot the raw tables to CSV so the analysis can be rerun without
# database access.
write_csv(buyer_ofx_table, file = "data/buyer_ofx.csv")
write_csv(location_ofx_table, file = "data/location_ofx.csv")
write_csv(order_product_ofx_table, file = "data/order_product_ofx.csv")
write_csv(product_ofx_table, file = "data/product_ofx.csv")

# Parse the two date columns before writing so the CSV stores real dates.
orders_ofx_table <- orders_ofx_table %>%
  mutate(Order_Date = ymd(Order_Date),
         Ship_Date = ymd(Ship_Date))
write_csv(orders_ofx_table, file = "data/orders_ofx.csv")
# SECURITY NOTE(review): this Google Maps API key is exposed in plain
# text; revoke it and load a fresh key via Sys.getenv("GOOGLE_MAPS_KEY").
register_google(key = "AIzaSyB14h_szuys3SaTv1WTaJ2WEDfGDN0sl-A")
# Geocode each distinct city/state pair once; mutate_geocode() appends
# lon/lat columns, then the enriched table overwrites the earlier CSV.
location_ofx_table <- location_ofx_table %>%
distinct(Postal_Code, City, State, Region) %>%
mutate(city_state = paste(City, State, sep = ", ")) %>%
mutate_geocode(city_state)
write_csv(location_ofx_table, file = "data/location_ofx.csv")
# Normalize every table's column names to lower case so joins and column
# references use one consistent convention for the rest of the analysis.
buyer_ofx_table <- rename_with(buyer_ofx_table, tolower)
location_ofx_table <- rename_with(location_ofx_table, tolower)
category_ofx_table <- rename_with(category_ofx_table, tolower)
order_product_ofx_table <- rename_with(order_product_ofx_table, tolower)
orders_ofx_table <- rename_with(orders_ofx_table, tolower)
product_ofx_table <- rename_with(product_ofx_table, tolower)
# Missing-value audit: count the NAs in every column of each table.
buyer_ofx_table %>%
summarise(across(everything(), ~ sum(is.na(.))))
## buyer_id last_name first_name type
## 1 0 1 0 0
# One buyer row is missing last_name; na.omit() drops any row containing
# at least one NA.
buyer_ofx_table <- na.omit(buyer_ofx_table)
buyer_ofx_table %>%
summarise(across(everything(), ~ sum(is.na(.))))
## buyer_id last_name first_name type
## 1 0 0 0 0
# The remaining tables are complete (all-zero counts below).
location_ofx_table %>%
summarise(across(everything(), ~ sum(is.na(.))))
## postal_code city state region
## 1 0 0 0 0
category_ofx_table %>%
summarise(across(everything(), ~ sum(is.na(.))))
## sub_category category
## 1 0 0
order_product_ofx_table %>%
summarise(across(everything(), ~ sum(is.na(.))))
## order_id product_id quantity unit_price discount gross_profit_per_unit
## 1 0 0 0 0 0 0
orders_ofx_table %>%
summarise(across(everything(), ~ sum(is.na(.))))
## order_id order_date ship_date ship_mode buyer_id postal_code
## 1 0 0 0 0 0 0
product_ofx_table %>%
summarise(across(everything(), ~ sum(is.na(.))))
## product_id product_name sub_category
## 1 0 0 0
# Attach product details to every order line item.
order_prod_product_ofx <- order_product_ofx_table %>%
inner_join(product_ofx_table, by = "product_id")
# Attach geocoded city/state/region info to every order via postal code.
location_orders_ofx <- location_ofx_table %>%
inner_join(orders_ofx_table, by = "postal_code")
# Order volume by sales region, largest first.
location_orders_ofx %>%
group_by(region) %>%
summarise(region_order_count = n()) %>%
arrange(desc(region_order_count))
## # A tibble: 4 x 2
## region region_order_count
## <chr> <int>
## 1 West 1611
## 2 East 1401
## 3 Central 1175
## 4 South 822
# Per-city order counts with coordinates, saved for the map layers below.
regional_mapping_data <- location_orders_ofx %>%
group_by(city, state, lon, lat) %>%
summarise(number_of_orders = n())
write_csv(regional_mapping_data, file = "data/regionalmappingdata.csv")
complete_mapping_data <- read_csv("data/regionalmappingdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## city = col_character(),
## state = col_character(),
## lon = col_double(),
## lat = col_double(),
## number_of_orders = col_double()
## )
# Country-wide bubble map: point size encodes orders per city.
usa_map <- get_map(location = 'USA', zoom = 4, source = 'google', maptype = "terrain", color = "bw" )
## Source : https://maps.googleapis.com/maps/api/staticmap?center=USA&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx-A
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=USA&key=xxx-A
ggmap(usa_map, base_layer = ggplot(complete_mapping_data, aes(x = lon, y = lat, size = number_of_orders))) +
geom_point(color = "seagreen", alpha = .5) +
labs(title = "Orders Across the USA", size = "Number of Orders")
Observation: As you can see here, company sales look like they’re heavily concentrated on the coasts of the USA. There aren’t many tight clusters of dots in the middle of the country, excluding Chicago. More thorough analysis is needed to see what exact metropolitan areas company orders are mostly concentrated in on the coasts (see Observation 2).
# Zoomed maps for the two coastal hot spots.
east_map <- get_map(location = 'New York', zoom = 8, source = 'google', maptype = "terrain", color = "bw" )
## Source : https://maps.googleapis.com/maps/api/staticmap?center=New%20York&zoom=8&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx-A
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New+York&key=xxx-A
west_map <- get_map(location = 'California', zoom = 6, source = 'google', maptype = "terrain", color = "bw")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=California&zoom=6&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx-A
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=California&key=xxx-A
# NOTE(review): the `color =` entry in labs() is inert — no color
# aesthetic is mapped, so no such legend is produced.
east_map_creation <- ggmap(east_map, base_layer = ggplot(complete_mapping_data, aes(x = lon, y = lat, size = number_of_orders))) +
geom_point(color = "blue", alpha = .5) +
labs(title = "East Region Concentration of Orders", color = "East Region Locations", size = "Number of Orders")
west_map_creation <- ggmap(west_map, base_layer = ggplot(complete_mapping_data, aes(x = lon, y = lat, size = number_of_orders))) +
geom_point(color = "red", alpha = .5) +
labs(title = "West Region Concentration of Orders", color = "West Region Locations", size = "Number of Orders")
# Side-by-side comparison; points outside each map's bounding box are
# dropped, hence the "Removed ... rows" warnings.
grid.arrange(east_map_creation, west_map_creation, ncol = 2, nrow = 1)
## Warning: Removed 562 rows containing missing values (geom_point).
## Warning: Removed 516 rows containing missing values (geom_point).
Observation: This map further establishes that sales are concentrated in major cities on the coasts. As can be seen more clearly here, most orders on the West are shipped to LA or San Francisco and the East’s orders are shipped mostly to NYC and Philadelphia. This is useful knowledge; with this data, the company knows that it should shift a significant amount of its resources to cater to these areas. With so many customers, and potential customers, here the company should consider how it can make shipping faster and improve the overall customer experience for those living in these areas.
# Zoom on Illinois to assess Chicago as a potential midwest hub.
illinois_map <- get_map(location = 'Illinois', zoom = 6, source = 'google', maptype = "terrain", color = "bw" )
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Illinois&zoom=6&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx-A
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Illinois&key=xxx-A
# FIX: the title (and an inert `color =` label) said "East Region" —
# copied from the east-coast plot — on a map of Illinois.
illinois_map_creation <- ggmap(illinois_map, base_layer = ggplot(complete_mapping_data, aes(x = lon, y = lat, size = number_of_orders))) +
geom_point(color = "blue", alpha = .5) +
labs(title = "Illinois Concentration of Orders", size = "Number of Orders")
illinois_map_creation
## Warning: Removed 447 rows containing missing values (geom_point).
# Order counts for the Chicago-vs-Philadelphia comparison below.
location_orders_ofx %>%
filter(city == "Chicago") %>%
summarise(city_order_count = n())
## city_order_count
## 1 171
location_orders_ofx %>%
filter(city == "Philadelphia") %>%
summarise(city_order_count = n())
## city_order_count
## 1 265
Observation: Chicago is the hub of the midwest, and it had about 90 fewer orders than Philadelphia – one of the best-performing cities in the East region. With this in mind, I think Chicago should be given much of the same treatment as the East and West regions going forward in terms of resource allocation. With greater company resources focused on, and a greater presence in, Chicago, it could become the center-point of attack for the company in the midwest – giving the company a strategic foothold in a part of the country where it has had little traction historically.
# Rebuild the order-line → product → orders join (also created earlier;
# repeated here so this chunk stands alone).
order_prod_product_ofx <- order_product_ofx_table %>%
  inner_join(product_ofx_table, by = "product_id")
orders_prod_full <- order_prod_product_ofx %>%
  inner_join(orders_ofx_table, by = "order_id")
# Total profit in the two flagship ZIP codes.
# FIX: the original summarise() had no aggregate, so it produced one row
# per order line (triggering the "grouped output" message) and relied on
# geom_col() stacking to total the bars. Aggregate explicitly with sum().
orders_prod_full %>%
  filter(postal_code %in% c("10009", "90004")) %>%
  group_by(postal_code) %>%
  summarise(profit = sum(unit_price * gross_profit_per_unit)) %>%
  ggplot(aes(x = postal_code, y = profit)) +
  geom_col() +
  labs(title = "Total Profit for the NYC and LA Areas",
       x = "Postal Codes (NYC is 10009, LA is 90004)",
       y = "Total Profit ($)")
Observation: While New York City and Los Angeles are both hotbeds for orders (from all consumer types ordering items from all sub-categories), New York is FAR more profitable. This is useful knowledge; with it, the company can allocate its resources to the West and East regions accordingly.
# Join categories to products, then to order lines.
category_prod_ofx <- category_ofx_table %>%
  inner_join(product_ofx_table, by = "sub_category")
category_order_prod_ofx <- category_prod_ofx %>%
  inner_join(order_product_ofx_table, by = "product_id")
# Total profit per sub-category.
# FIX: summarise() now aggregates with sum(); the original emitted one row
# per order line and depended on geom_col() stacking for the totals.
category_order_prod_ofx %>%
  group_by(sub_category) %>%
  summarise(profit = sum(unit_price * gross_profit_per_unit)) %>%
  ggplot() +
  geom_col(aes(x = sub_category, y = profit)) +
  labs(title = "Profit by Sub Category",
       x = "Sub Category",
       y = "Profit ($)")
Observation: The MOST PROFITABLE sub-category of items the company sells is copiers. This is partly because they are an expensive item, though. Even so, this is important to know, since we now know which sub-category is worth marketing aggressively to bring the company more cash in the future.
Yet, we still don’t know what the most popular item is (explored in Question 6).
# Order-line counts plotted per sub-category.
# NOTE(review): grouping by quantity as well as sub_category means n()
# counts (sub_category, quantity) combinations; the bars only read as
# per-sub-category totals because geom_col() stacks them. Confirm the
# extra grouping variable was intended.
order_prod_product_ofx %>%
group_by(sub_category, quantity) %>%
summarise(product_orders = n()) %>%
ggplot() +
geom_col(aes(x = sub_category, y = product_orders)) +
labs(title = "Product Popularity",
x = "Product Name",
y = "Number of Product Orders")
## `summarise()` has grouped output by 'sub_category'. You can override using the `.groups` argument.
# Top ten sub-categories by number of order lines.
order_prod_product_ofx %>%
group_by(sub_category) %>%
summarise(region_order_count = n()) %>%
arrange(desc(region_order_count)) %>%
head(10)
## # A tibble: 10 x 2
## sub_category region_order_count
## <chr> <int>
## 1 Binders 1522
## 2 Paper 1368
## 3 Furnishings 956
## 4 Phones 889
## 5 Storage 845
## 6 Art 796
## 7 Accessories 773
## 8 Chairs 616
## 9 Appliances 466
## 10 Labels 364
Observation: Binders and paper are the most popular items sold by the company (they have the most orders). This is important to know for future inventory, which will be explored more in observation 7.
# Join products → order lines → orders so order dates are available.
product_order_prod_ofx <- category_prod_ofx %>%
  inner_join(order_product_ofx_table, by = "product_id")
total_order_product_info <- product_order_prod_ofx %>%
  inner_join(orders_ofx_table, by = "order_id")
# Binder units ordered per year.
# FIX: the original added scale_x_continuous() AND xlim(), so ggplot
# warned and replaced the first scale; set limits on single scales.
total_order_product_info %>%
  filter(sub_category == "Binders") %>%
  mutate(year = year(order_date)) %>%
  group_by(year) %>%
  summarise(product_orders = sum(quantity)) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = product_orders)) +
  geom_line() +
  scale_x_continuous(limits = c(2011, 2014)) +
  scale_y_continuous(limits = c(100, 3000)) +
  labs(title = "Binder Sales from 2011 Onward", x = "Years", y = "Orders")
# Paper units ordered per year.
# FIX: removed the redundant bare scale_x_continuous() that xlim() was
# replacing (the source of the "Scale for 'x' is already present" warning)
# and set limits on single scales instead.
total_order_product_info %>%
  filter(sub_category == "Paper") %>%
  mutate(year = year(order_date)) %>%
  group_by(year) %>%
  summarise(product_orders = sum(quantity)) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = product_orders)) +
  geom_line() +
  scale_x_continuous(limits = c(2011, 2014)) +
  scale_y_continuous(limits = c(100, 3000)) +
  labs(title = "Paper Sales from 2011 Onward", x = "Years", y = "Orders")
# Copier units ordered per year (note the smaller y range).
# FIX: removed the redundant bare scale_x_continuous() that xlim() was
# replacing (the source of the "Scale for 'x' is already present" warning)
# and set limits on single scales instead.
total_order_product_info %>%
  filter(sub_category == "Copiers") %>%
  mutate(year = year(order_date)) %>%
  group_by(year) %>%
  summarise(product_orders = sum(quantity)) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = product_orders)) +
  geom_line() +
  scale_x_continuous(limits = c(2011, 2014)) +
  scale_y_continuous(limits = c(10, 100)) +
  labs(title = "Copier Sales from 2011 Onward", x = "Years", y = "Orders")
Observation: It appears that orders of binders and paper have increased from 2011-2014. This means that it may be wise for the company to check its inventory of binders and papers and potentially increase it (anticipating trends continue).
Copier orders were also trending up with consistency, so the company might want to think about increasing its inventory of this item to meet future demand, too.
# Bottom ten sub-categories by number of order lines (ascending).
order_prod_product_ofx %>%
group_by(sub_category) %>%
summarise(region_order_count = n()) %>%
arrange(region_order_count) %>%
head(10)
## # A tibble: 10 x 2
## sub_category region_order_count
## <chr> <int>
## 1 Copiers 68
## 2 Machines 115
## 3 Supplies 190
## 4 Fasteners 217
## 5 Bookcases 228
## 6 Envelopes 254
## 7 Tables 319
## 8 Labels 364
## 9 Appliances 466
## 10 Chairs 616
# Single least-profitable sub-category.
# FIX: the original summarise() kept one row per order line (no
# aggregate), so arrange()/head(1) returned the single worst line item,
# not the worst sub-category total. Aggregate with sum() first; this may
# change the reported figure, but it now answers the intended question.
order_prod_product_ofx %>%
  group_by(sub_category) %>%
  summarise(profit = sum(unit_price * gross_profit_per_unit)) %>%
  arrange(profit) %>%
  head(1)
In terms of underperforming items, it seems that machines tend to not only NOT earn a profit but actually lose the business money. This could be a pricing issue or a demand issue (maybe we have too much inventory). This is definitely something to look into.
Some other underperforming items are more expensive, like copiers, but some are less expensive and not selling well (EX: envelopes, fasteners). It might be time to consider getting rid of these items or replacing them.
# Data for the correlation heatmap.
# NOTE(review): grouping by gross_profit_per_unit and then overwriting it
# with gross_profit_per_unit * sum(quantity) is hard to interpret — each
# group is a unique (price, discount, per-unit-profit) combination, so
# sum(quantity) is the total units sold at that combination. Confirm this
# quantity-weighting is intended before trusting the correlations below.
cor_ofx_data <- order_product_ofx_table %>%
group_by(unit_price, discount, gross_profit_per_unit) %>%
summarise(gross_profit_per_unit = gross_profit_per_unit * sum(quantity))
## `summarise()` has grouped output by 'unit_price', 'discount', 'gross_profit_per_unit'. You can override using the `.groups` argument.
# Pairwise Pearson correlations rendered as a labeled tile heatmap.
cor_ofx_data %>%
correlate(diagonal = 1) %>%
stretch() %>%
mutate(r = round(r, 2)) %>%
ggplot(aes(x = x, y = y, fill = r)) +
geom_tile() +
geom_text(aes(x = x, y = y, label = r)) +
scale_fill_gradient2(low = "orange", high = "purple", guide = "colorbar") +
labs(title = "Heatmap Representing Correlation of Multiple Items in the Order Product Table", x = "", y = "")
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Observation: While it makes sense that unit price and gross profit per unit are positively correlated to a decent extent, the fact that discount and gross profit per unit were only slightly negatively correlated was more interesting. On the surface, it makes sense on a fundamental level that gross profit would drop as the discount increases and vice versa. However, this -0.2+ correlation might mean that there could be some merit in increasing the use of discounts since the negative correlation isn’t that strong (increasing discount by a little doesn’t have a significant effect on gross profit, so it might be smart to do this).
# Attach buyer attributes (type) to every order line.
total_order_product_info_with_buyers <- total_order_product_info %>%
  inner_join(buyer_ofx_table, by = "buyer_id")
# Mean per-line profit by buyer type.
# FIX: collapsed the confusing double summarise() — the first pass did not
# aggregate and only worked because grouping by `type` was retained — into
# one explicit mean; geom_bar(stat = "identity") is spelled geom_col().
total_order_product_info_with_buyers %>%
  group_by(type) %>%
  summarise(mean_profit_per_type = mean(unit_price * gross_profit_per_unit)) %>%
  ggplot(aes(x = type, y = mean_profit_per_type)) +
  geom_col() +
  labs(title = "Average Profit per Customer Type",
       x = "Type",
       y = "Avg. Profit ($)")
Observation: One might think that the average corporate client would be the most profitable for the company to pursue, but it turns out that regular consumers and those furnishing home offices are the MOST profitable on average. With this information, the company should consider allocating more marketing resources to bring in more regular consumers and people looking to furnish their home offices.
# Monthly sales totals faceted by year to reveal seasonality.
# FIX: the mutated total_sales column was ignored and recomputed inside
# summarise(); reuse it, and drop the grouping explicitly instead of
# relying on the "grouped output" message + ungroup().
orders_prod_full %>%
  mutate(year = year(order_date),
         month = month(order_date),
         month = factor(month, labels = c("Jan", "Feb", "Mar", "Apr", "May",
                                          "June", "July", "Aug", "Sep",
                                          "Oct", "Nov", "Dec")),
         total_sales = unit_price * quantity) %>%
  group_by(year, month) %>%
  summarise(total_sales = sum(total_sales), .groups = "drop") %>%
  ggplot() +
  geom_col(aes(x = total_sales, y = month)) +
  labs(title = "Bar Chart Showing Seasonality of Sales",
       x = "Sales",
       y = "Month") +
  facet_wrap(~year)
Observation: In terms of seasonality, it appears that most sales happen during late fall and early winter. This is important to know so the company can better plan its calendar each quarter, and so it can do some research and potentially bump up its numbers during the other seasons. This company sells office products; there’s no reason why it should make most of its sales in only one season – it seems possible that consistency could be achieved.
# Same seasonality view, additionally faceted by buyer type.
orders_prod_full_with_buyer <- orders_prod_full %>%
  inner_join(buyer_ofx_table, by = "buyer_id")
# FIX: reuse the mutated total_sales column (it was recomputed inside
# summarise) and drop the grouping explicitly via .groups = "drop".
orders_prod_full_with_buyer %>%
  mutate(year = year(order_date),
         month = month(order_date),
         month = factor(month, labels = c("Jan", "Feb", "Mar", "Apr", "May",
                                          "June", "July", "Aug", "Sep",
                                          "Oct", "Nov", "Dec")),
         total_sales = unit_price * quantity) %>%
  group_by(type, year, month) %>%
  summarise(total_sales = sum(total_sales), .groups = "drop") %>%
  ggplot() +
  geom_col(aes(x = total_sales, y = month)) +
  labs(title = "Bar Chart Showing Seasonality of Sales",
       x = "Sales",
       y = "Month") +
  facet_wrap(type ~ year)
Observation: After looking at seasonality in general terms, I wanted to see if sale trends were impacted by consumer type. Evidently, for the most part, they aren’t. This means the company does do most of its sales in winter regardless of customer type. It has to find a way to close on more sales in the other seasons, which should have similar sales figures!
# Load the hourly bike-share data set.
# NOTE(review): the layout matches the common hourly bike-sharing schema
# (season/yr/mnth/hr codes; casual + registered = cnt) — confirm against
# the data dictionary.
bike_sharing <- read_csv("data/bike_share_hourly.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## instant = col_double(),
## dteday = col_date(format = ""),
## season = col_double(),
## yr = col_double(),
## mnth = col_double(),
## hr = col_double(),
## holiday = col_double(),
## weekday = col_double(),
## workingday = col_double(),
## weathersit = col_double(),
## temp = col_double(),
## atemp = col_double(),
## hum = col_double(),
## windspeed = col_double(),
## casual = col_double(),
## registered = col_double(),
## cnt = col_double()
## )
# Lower-case column names, consistent with the rest of the analysis.
bike_sharing <- bike_sharing %>%
rename_with(tolower)
# NA audit: the data set is complete (all zeros).
bike_sharing %>%
summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 x 17
## instant dteday season yr mnth hr holiday weekday workingday weathersit
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 0 0
## # … with 7 more variables: temp <int>, atemp <int>, hum <int>, windspeed <int>,
## # casual <int>, registered <int>, cnt <int>
# Quick look at the raw rows.
bike_sharing %>%
head(10)
## # A tibble: 10 x 17
## instant dteday season yr mnth hr holiday weekday workingday
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2011-01-01 1 0 1 0 0 6 0
## 2 2 2011-01-01 1 0 1 1 0 6 0
## 3 3 2011-01-01 1 0 1 2 0 6 0
## 4 4 2011-01-01 1 0 1 3 0 6 0
## 5 5 2011-01-01 1 0 1 4 0 6 0
## 6 6 2011-01-01 1 0 1 5 0 6 0
## 7 7 2011-01-01 1 0 1 6 0 6 0
## 8 8 2011-01-01 1 0 1 7 0 6 0
## 9 9 2011-01-01 1 0 1 8 0 6 0
## 10 10 2011-01-01 1 0 1 9 0 6 0
## # … with 8 more variables: weathersit <dbl>, temp <dbl>, atemp <dbl>,
## # hum <dbl>, windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>
# Encode categorical codes as factors; yr (0/1 in the raw data) gets
# human-readable year labels.
bike_sharing <- bike_sharing %>%
mutate(season = factor(season),
yr = factor(yr, labels = c("2011", "2012")),
mnth = factor(mnth),
weekday = factor(weekday),
weathersit = factor(weathersit))
# Bucket hour-of-day into four coarse periods, stored as a factor:
# 1 = overnight (00-06), 2 = morning (07-13), 3 = afternoon (14-18),
# 4 = evening (19-23).
# FIX: merged the two consecutive mutate() passes into one; hr ranges
# 0-23 (see the glimpse output), so each case only needs an upper bound.
bike_sharing <- bike_sharing %>%
  mutate(time_of_day = factor(case_when(
    hr <= 6 ~ 1,
    hr <= 13 ~ 2,
    hr <= 18 ~ 3,
    TRUE ~ 4)))
# Density plots for the four numeric weather predictors. Their x ranges
# fall within [0, 1] (the source data arrives pre-scaled), which matters
# for distance-based k-NN — no further normalization is required.
temp_density_dist <- bike_sharing %>%
ggplot(aes(x = temp)) +
geom_density() +
labs(title = "Temp Density Dist.", x = "Temperature")
atemp_density_dist <- bike_sharing %>%
ggplot(aes(x = atemp)) +
geom_density() +
labs(title = "Temp. Feel Density Dist.", x = "Temperature Feel")
hum_density_dist <- bike_sharing %>%
ggplot(aes(x = hum)) +
geom_density() +
labs(title = "Humidity Density Dist.", x = "Humidity")
windspeed_density_dist <- bike_sharing %>%
ggplot(aes(x = windspeed)) +
geom_density() +
labs(title = "Windspeed Density Dist.", x = "Windspeed")
# 2x2 grid of the four distributions.
grid.arrange(temp_density_dist, atemp_density_dist, hum_density_dist, windspeed_density_dist, ncol = 2, nrow = 2)
Are all variable values suitable for use with the k-NN classification algorithm? Yes, I’d say so because of my answer to the question below (values = normalized).
Do you need to further normalize or standardize the numeric variables? No! The values for each plot on the x axis are between 0 and 1.
# Daily ridership totals for each rider class.
casual_riders_line <- summarise(group_by(bike_sharing, dteday),
                                casual_r_ship = sum(casual))
total_riders_line <- summarise(group_by(bike_sharing, dteday),
                               total_r_ship = sum(cnt))
registered_riders_line <- summarise(group_by(bike_sharing, dteday),
                                    registered_r_ship = sum(registered))
# Helper: daily line chart with a titled y axis.
rider_line_chart <- function(daily_tbl, y_col, chart_title, y_label) {
  ggplot(daily_tbl, aes(x = dteday, y = {{ y_col }})) +
    geom_line() +
    labs(title = chart_title, x = "Date", y = y_label)
}
casual_riders_line_graph <- rider_line_chart(
  casual_riders_line, casual_r_ship,
  "Line Graph of Casual Riders over the Years", "Casual Riders")
total_riders_line_graph <- rider_line_chart(
  total_riders_line, total_r_ship,
  "Line Graph of Total Riders over the Years", "Total Riders")
registered_riders_line_graph <- rider_line_chart(
  registered_riders_line, registered_r_ship,
  "Line Graph of Registered Riders over the Years", "Registered Riders")
# Stack the three series vertically for visual comparison.
grid.arrange(casual_riders_line_graph, total_riders_line_graph,
             registered_riders_line_graph, ncol = 1, nrow = 3)
# Reproducible 75/25 train/test split.
set.seed(2021)
bike_sharing_split <- initial_split(bike_sharing, prop = .75)
bike_sharing_train <- training(bike_sharing_split)
bike_sharing_test <- testing(bike_sharing_split)
bike_sharing_train %>%
glimpse()
## Rows: 13,035
## Columns: 18
## $ instant <dbl> 2, 3, 4, 5, 6, 7, 8, 9, 11, 14, 15, 16, 18, 19, 20, 23, 24…
## $ dteday <date> 2011-01-01, 2011-01-01, 2011-01-01, 2011-01-01, 2011-01-0…
## $ season <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ yr <fct> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ mnth <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ hr <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 10, 13, 14, 15, 17, 18, 19, 22, 23…
## $ holiday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ weekday <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0…
## $ workingday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ weathersit <fct> 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2…
## $ temp <dbl> 0.22, 0.22, 0.24, 0.24, 0.24, 0.22, 0.20, 0.24, 0.38, 0.46…
## $ atemp <dbl> 0.2727, 0.2727, 0.2879, 0.2879, 0.2576, 0.2727, 0.2576, 0.…
## $ hum <dbl> 0.80, 0.80, 0.75, 0.75, 0.75, 0.80, 0.86, 0.75, 0.76, 0.72…
## $ windspeed <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0896, 0.0000, 0.0000, 0.…
## $ casual <dbl> 8, 5, 3, 0, 0, 2, 1, 1, 12, 47, 35, 40, 15, 9, 6, 11, 15, …
## $ registered <dbl> 32, 27, 10, 1, 1, 0, 2, 7, 24, 47, 71, 70, 52, 26, 31, 17,…
## $ cnt <dbl> 40, 32, 13, 1, 1, 2, 3, 8, 36, 94, 106, 110, 67, 35, 37, 2…
## $ time_of_day <fct> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 1, 1, 1…
What sticks out to you? There are more riders in 2012 than 2011.
The dteday variable will not be used as a predictor. Should the year variable be dropped? Honestly, since year was dropped later I’m assuming yes. I think a lot of this has to do with the fact that 2012 has significantly more riders than 2011.
Should both years of data be used to predict total ridership? What if management believed the number of registered users would to remain constant at the 2012 levels? Yes I think that’s ok but more stock should be taken in 2012 data because it has more riders. If registered users was to remain at 2012 levels though, it makes sense to only look at 2012 data.
# Preprocessing recipe: drop identifiers and leakage columns (registered
# and casual sum to cnt, the target; hr is superseded by time_of_day; yr
# is dropped per the discussion above), then one-hot encode the factors.
# NOTE(review): holiday and workingday stay as numeric 0/1 — acceptable
# for k-NN, but confirm that was deliberate.
bike_sharing_recipe <- recipe(cnt ~ ., data = bike_sharing_train) %>%
step_rm(instant, dteday, yr, hr, registered, casual) %>%
step_dummy(time_of_day, season, mnth, weekday, weathersit, one_hot = TRUE) %>%
prep()
# juice() returns the prepped training data; bake() applies the identical
# transformations to the held-out test split.
bike_sharing_juiced <- juice(bike_sharing_recipe)
bike_sharing_test_baked <- bake(bike_sharing_recipe, new_data = bike_sharing_test)
# Fit k-NN regressions for odd k in 1..25 and collect train/test metrics.
# FIX: parsnip's argument is `neighbors`; the original's `neighbor = i`
# only worked through R's partial argument matching. Also replaced the
# grow-by-bind_rows if/else with preallocated lists bound once at the end.
k_values <- seq(1, 25, 2)
train_metric_list <- vector("list", length(k_values))
test_metric_list <- vector("list", length(k_values))
for (j in seq_along(k_values)) {
  k <- k_values[j]
  knn_spec <- nearest_neighbor(neighbors = k) %>%
    set_engine("kknn") %>%
    set_mode("regression")
  knn_fit <- knn_spec %>%
    fit(cnt ~ ., data = bike_sharing_juiced)
  # Metrics on the training (juiced) data.
  train_metric_list[[j]] <- knn_fit %>%
    predict(bike_sharing_juiced) %>%
    bind_cols(bike_sharing_juiced) %>%
    metrics(truth = cnt, estimate = .pred) %>%
    mutate(k = k)
  # Metrics on the held-out (baked) test data.
  test_metric_list[[j]] <- knn_fit %>%
    predict(bike_sharing_test_baked) %>%
    bind_cols(bike_sharing_test_baked) %>%
    metrics(truth = cnt, estimate = .pred) %>%
    mutate(k = k)
}
summary_train_knn_fit <- bind_rows(train_metric_list)
summary_test_knn_fit <- bind_rows(test_metric_list)
# Training-set RMSE/R^2 by k. k = 1 fits almost perfectly — the model is
# memorizing the training data.
summary_train_knn_fit %>%
filter(.metric %in% c("rmse", "rsq"))
## # A tibble: 26 x 4
## .metric .estimator .estimate k
## <chr> <chr> <dbl> <dbl>
## 1 rmse standard 22.4 1
## 2 rsq standard 0.985 1
## 3 rmse standard 42.2 3
## 4 rsq standard 0.948 3
## 5 rmse standard 59.4 5
## 6 rsq standard 0.897 5
## 7 rmse standard 69.6 7
## 8 rsq standard 0.858 7
## 9 rmse standard 76.6 9
## 10 rsq standard 0.828 9
## # … with 16 more rows
# Test-set metrics tell the real story: error drops as k grows past 1.
summary_test_knn_fit %>%
filter(.metric %in% c("rmse", "rsq"))
## # A tibble: 26 x 4
## .metric .estimator .estimate k
## <chr> <chr> <dbl> <dbl>
## 1 rmse standard 139. 1
## 2 rsq standard 0.495 1
## 3 rmse standard 125. 3
## 4 rsq standard 0.548 3
## 5 rmse standard 121. 5
## 6 rsq standard 0.566 5
## 7 rmse standard 119. 7
## 8 rsq standard 0.573 7
## 9 rmse standard 118. 9
## 10 rsq standard 0.576 9
## # … with 16 more rows
What is the best k value? Why? K-Value = 10. Based on the graphs below it appears that the graph starts bending between 9 and 10 (I went with 10, though).
# Helper: chart RMSE against k for one metrics table; split_label names
# the data split in the title ("Training" or "Testing").
plot_rmse_by_k <- function(metrics_tbl, split_label) {
  metrics_tbl %>%
    filter(.metric == "rmse") %>%
    ggplot(aes(x = k, y = .estimate)) +
    geom_point() +
    geom_line() +
    labs(title = paste("Bike Sharing", split_label, "Data RMSE by k values"),
         x = "k Values",
         y = "RMSE")
}
plot_rmse_by_k(summary_train_knn_fit, "Training")
plot_rmse_by_k(summary_test_knn_fit, "Testing")
# Refit on the full (baked) data with the chosen k = 10.
# FIX: spell out parsnip's `neighbors` argument — the original's
# `neighbor = 10` relied on R's partial argument matching.
final_bike_sharing_baked <- bake(bike_sharing_recipe, new_data = bike_sharing)
final_knn_spec <- nearest_neighbor(neighbors = 10) %>%
  set_engine("kknn") %>%
  set_mode("regression")
final_knn_fit <- final_knn_spec %>%
  fit(cnt ~ ., data = final_bike_sharing_baked)
# Predicted vs actual riders; a good fit hugs the dashed 45-degree line.
final_knn_fit %>%
  predict(final_bike_sharing_baked) %>%
  bind_cols(final_bike_sharing_baked) %>%
  select(cnt, .pred) %>%
  ggplot(aes(x = cnt, y = .pred)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Bike Sharing Data K-NN Regression Fit to a Line",
       y = "Predicted Bike Riders",
       x = "Actual Bike Riders") +
  coord_obs_pred()
Interpretation: It appears that the dots don’t fit along the line very well.
# NOTE(review): abandoned scenario-prediction attempt, left commented out.
# Two likely reasons it failed: (1) the tibble's columns must match the
# baked data exactly, but the recipe only dummies time_of_day, season,
# mnth, weekday, and weathersit — workingday_X0/X1 and holiday_X1/X2
# below were never created by step_dummy(), and plain holiday/workingday
# columns are missing; (2) temp = 5, atemp = 3, hum = 60 are raw units,
# while the model was trained on values scaled to [0, 1].
#apply_model <- tibble( time_of_day_X1 = 1,
#time_of_day_X2 = 0,
#time_of_day_X3 = 0,
#time_of_day_X4 = 0,
#season_X1 = 1,
#season_X2 = 0,
#season_X3 = 0,
#season_X4 = 0,
#mnth_X1 = 0,
#mnth_X2 = 1,
#mnth_X3 = 0,
#mnth_X4 = 0,
#mnth_X5 = 0,
#mnth_X6 = 0,
#mnth_X7 = 0,
#mnth_X8 = 0,
#mnth_X9 = 0,
#mnth_X10 = 0,
#mnth_X11 = 0,
#mnth_X12 = 0,
#weekday_X0 = 0,
#weekday_X1 = 0,
#weekday_X2 = 0,
#weekday_X3 = 0,
#weekday_X4 = 0,
#weekday_X5 = 0,
#weekday_X6 = 1,
#workingday_X0 = 1,
#workingday_X1 = 0,
#weathersit_X1 = 0,
#weathersit_X2 = 1,
#weathersit_X3 = 0,
#weathersit_X4 = 0,
#holiday_X1 = 1,
#holiday_X2 = 0,
#temp = 5,
#atemp = 3,
#hum = 60,
#windspeed = 0
#)
#predict(final_knn_fit, new_data = apply_model)
SORRY, I couldn’t figure this part out, although I think I’m close based on Professor Ballenger’s advice. I think it might not be working because some of the data might have to be normalized now, although I’m not sure.
# Restrict to 2012 observations, then make a reproducible 75/25
# train/test split.
bike_sharing_2012 <- filter(bike_sharing, yr == 2012)
set.seed(2021)
bike_sharing_split_2012 <- initial_split(bike_sharing_2012, prop = 0.75)
bike_sharing_train_2012 <- training(bike_sharing_split_2012)
bike_sharing_test_2012 <- testing(bike_sharing_split_2012)
glimpse(bike_sharing_train_2012)
## Rows: 6,551
## Columns: 18
## $ instant <dbl> 8647, 8648, 8649, 8650, 8651, 8652, 8653, 8654, 8658, 8659…
## $ dteday <date> 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-0…
## $ season <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ yr <fct> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012…
## $ mnth <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ hr <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18, 20, 21…
## $ holiday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ weekday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ workingday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ weathersit <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 2, 2, 1, 1, 1…
## $ temp <dbl> 0.36, 0.32, 0.30, 0.28, 0.28, 0.26, 0.26, 0.26, 0.40, 0.42…
## $ atemp <dbl> 0.3485, 0.3485, 0.3333, 0.3030, 0.2879, 0.2727, 0.2576, 0.…
## $ hum <dbl> 0.66, 0.76, 0.81, 0.81, 0.81, 0.93, 0.93, 0.87, 0.62, 0.58…
## $ windspeed <dbl> 0.1343, 0.0000, 0.0000, 0.0896, 0.1045, 0.1343, 0.1642, 0.…
## $ casual <dbl> 15, 16, 11, 0, 0, 1, 1, 4, 58, 82, 120, 101, 68, 36, 25, 2…
## $ registered <dbl> 78, 59, 41, 8, 5, 1, 6, 10, 143, 141, 147, 164, 147, 75, 8…
## $ cnt <dbl> 93, 75, 52, 8, 5, 2, 7, 14, 201, 223, 267, 265, 215, 111, …
## $ time_of_day <fct> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 1…
# Preprocess the 2012 training data: drop identifier/leakage columns
# (casual + registered sum to cnt) and one-hot encode the factors.
bike_sharing_recipe_2012 <-
  recipe(cnt ~ ., data = bike_sharing_train_2012) %>%
  step_rm(instant, dteday, yr, hr, registered, casual) %>%
  step_dummy(season, mnth, weekday, weathersit, one_hot = TRUE) %>%
  prep()
bike_sharing_juiced_2012 <- bike_sharing_recipe_2012 %>% juice()
bike_sharing_test_baked_2012 <- bike_sharing_recipe_2012 %>%
  bake(new_data = bike_sharing_test_2012)
# Tune k over odd values 1..25 for the 2012 model: fit a k-NN regression
# on the training data and collect train/test metrics for each k.
begin_2012 <- 1
# FIX: the loop originally used `begin` (a leftover from the earlier
# tuning loop) instead of `begin_2012`.
for (i in seq(begin_2012, 25, 2)) {
  # FIX: the parsnip argument is `neighbors`; `neighbor` only worked via
  # partial argument matching.
  knn_spec_2012 <- nearest_neighbor(neighbors = i) %>%
    set_engine("kknn") %>%
    set_mode("regression")
  knn_fit_2012 <- knn_spec_2012 %>%
    fit(cnt ~ ., data = bike_sharing_juiced_2012)
  train_knn_fit_2012 <- knn_fit_2012 %>%
    predict(bike_sharing_juiced_2012) %>%
    bind_cols(bike_sharing_juiced_2012) %>%
    metrics(truth = cnt, estimate = .pred) %>%
    mutate(k = i)
  test_knn_fit_2012 <- knn_fit_2012 %>%
    predict(bike_sharing_test_baked_2012) %>%
    bind_cols(bike_sharing_test_baked_2012) %>%
    metrics(truth = cnt, estimate = .pred) %>%
    mutate(k = i)
  if (i == begin_2012) {
    summary_train_knn_fit_2012 <- train_knn_fit_2012
    summary_test_knn_fit_2012 <- test_knn_fit_2012
  } else {
    # FIX: the training branch originally appended the *first* model's
    # objects (`summary_train_knn_fit`, `train_knn_fit`) instead of
    # accumulating the 2012 results, so the 2012 training summary was
    # wrong for every k after the first.
    summary_train_knn_fit_2012 <- bind_rows(summary_train_knn_fit_2012, train_knn_fit_2012)
    summary_test_knn_fit_2012 <- bind_rows(summary_test_knn_fit_2012, test_knn_fit_2012)
  }
}
# Show RMSE and R-squared for each k on the 2012 training data.
filter(summary_train_knn_fit_2012, .metric %in% c("rmse", "rsq"))
## # A tibble: 28 x 4
## .metric .estimator .estimate k
## <chr> <chr> <dbl> <dbl>
## 1 rmse standard 22.4 1
## 2 rsq standard 0.985 1
## 3 rmse standard 42.2 3
## 4 rsq standard 0.948 3
## 5 rmse standard 59.4 5
## 6 rsq standard 0.897 5
## 7 rmse standard 69.6 7
## 8 rsq standard 0.858 7
## 9 rmse standard 76.6 9
## 10 rsq standard 0.828 9
## # … with 18 more rows
# Show RMSE and R-squared for each k on the 2012 testing data.
filter(summary_test_knn_fit_2012, .metric %in% c("rmse", "rsq"))
## # A tibble: 26 x 4
## .metric .estimator .estimate k
## <chr> <chr> <dbl> <dbl>
## 1 rmse standard 161. 1
## 2 rsq standard 0.500 1
## 3 rmse standard 144. 3
## 4 rsq standard 0.562 3
## 5 rmse standard 138. 5
## 6 rsq standard 0.582 5
## 7 rmse standard 136. 7
## 8 rsq standard 0.589 7
## 9 rmse standard 136. 9
## 10 rsq standard 0.588 9
## # … with 16 more rows
What is the best k value with this model? Best k = 7. Based on the graphs below, the RMSE curve starts to level off between k = 6 and k = 8, so I chose k = 7.
# Plot 2012 training-set RMSE for each candidate k.
train_rmse_2012 <- filter(summary_train_knn_fit_2012, .metric == "rmse")
ggplot(train_rmse_2012, aes(x = k, y = .estimate)) +
  geom_point() +
  geom_line() +
  labs(title = "Bike Sharing Training Data 2012 RMSE by k values",
       x = "k Values",
       y = "RMSE")
# Plot 2012 testing-set RMSE for each candidate k.
test_rmse_2012 <- filter(summary_test_knn_fit_2012, .metric == "rmse")
ggplot(test_rmse_2012, aes(x = k, y = .estimate)) +
  geom_point() +
  geom_line() +
  labs(title = "Bike Sharing Testing Data 2012 RMSE by k values",
       x = "k Values",
       y = "RMSE")
# Bake the full 2012 data with the trained recipe, then fit the final
# k-NN regression model with the chosen k = 7.
final_bike_sharing_baked_2012 <- bake(bike_sharing_recipe_2012, new_data = bike_sharing_2012)
# FIX: the parsnip argument is `neighbors`; the original `neighbor` only
# worked through R's partial argument matching. Spell it out in full.
final_knn_spec_2012 <- nearest_neighbor(neighbors = 7) %>%
  set_engine("kknn") %>%
  set_mode("regression")
final_knn_fit_2012 <- final_knn_spec_2012 %>%
  fit(cnt ~ ., data = final_bike_sharing_baked_2012)
# Compare predicted vs. actual rider counts for the 2012 model; points
# on the dashed 45-degree line are perfect predictions.
pred_vs_actual_2012 <- final_knn_fit_2012 %>%
  predict(final_bike_sharing_baked_2012) %>%
  bind_cols(final_bike_sharing_baked_2012) %>%
  select(cnt, .pred)
ggplot(pred_vs_actual_2012, aes(x = cnt, y = .pred)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Bike Sharing Data 2012 K-NN Regression Fit to a Line",
       y = "Predicted Bike Riders",
       x = "Actual Bike Riders") +
  coord_obs_pred()
Interpretation: The dots fit along the line much better than in the previous model.
Which model produces the most accurate results?
The 2012 model is a bit better, the dots fit along the line better for the regression.
#apply_model <- tibble( time_of_day_X1 = 1,
#time_of_day_X2 = 0,
#time_of_day_X3 = 0,
#time_of_day_X4 = 0,
#season_X1 = 1,
#season_X2 = 0,
#season_X3 = 0,
#season_X4 = 0,
#mnth_X1 = 0,
#mnth_X2 = 1,
#mnth_X3 = 0,
#mnth_X4 = 0,
#mnth_X5 = 0,
#mnth_X6 = 0,
#mnth_X7 = 0,
#mnth_X8 = 0,
#mnth_X9 = 0,
#mnth_X10 = 0,
#mnth_X11 = 0,
#mnth_X12 = 0,
#weekday_X0 = 0,
#weekday_X1 = 0,
#weekday_X2 = 0,
#weekday_X3 = 0,
#weekday_X4 = 0,
#weekday_X5 = 0,
#weekday_X6 = 1,
#workingday_X0 = 1,
#workingday_X1 = 0,
#weathersit_X1 = 0,
#weathersit_X2 = 1,
#weathersit_X3 = 0,
#weathersit_X4 = 0,
#holiday_X1 = 1,
#holiday_X2 = 0,
#temp = 5,
#atemp = 3,
#hum = 60,
#windspeed = 0
#)
#predict(final_knn_fit, new_data = apply_model)
SORRY, I couldn’t figure this part out, although I think I’m close based on Professor Ballenger’s advice. I think it might not be working because some of the data might have to be normalized now, although I’m not sure.
# Load the customer-segmentation data set.
auto_segments <- read_csv(file.path("data", "auto_segments.csv"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ID = col_double(),
## Gender = col_character(),
## Ever_Married = col_character(),
## Age = col_double(),
## Graduated = col_character(),
## Profession = col_character(),
## Work_Experience = col_double(),
## Spending_Score = col_character(),
## Family_Size = col_double(),
## Var_1 = col_character(),
## Segmentation = col_character()
## )
# Lower-case every column name, then count missing values per column.
auto_segments <- rename_with(auto_segments, tolower)
auto_segments %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 x 11
## id gender ever_married age graduated profession work_experience
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 190 0 102 161 1097
## # … with 4 more variables: spending_score <int>, family_size <int>,
## # var_1 <int>, segmentation <int>
# Drop every row containing at least one NA, then verify nothing is left.
auto_segments <- na.omit(auto_segments)
summarise(auto_segments, across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 x 11
## id gender ever_married age graduated profession work_experience
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0
## # … with 4 more variables: spending_score <int>, family_size <int>,
## # var_1 <int>, segmentation <int>
# Peek at the first ten cleaned rows.
head(auto_segments, 10)
## # A tibble: 10 x 11
## id gender ever_married age graduated profession work_experience
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 462809 Male No 22 No Healthcare 1
## 2 466315 Female Yes 67 Yes Engineer 1
## 3 461735 Male Yes 67 Yes Lawyer 0
## 4 461319 Male Yes 56 No Artist 0
## 5 460156 Male No 32 Yes Healthcare 1
## 6 464347 Female No 33 Yes Healthcare 1
## 7 465015 Female Yes 61 Yes Engineer 0
## 8 465176 Female Yes 55 Yes Artist 1
## 9 464041 Female No 26 Yes Engineer 1
## 10 464942 Male No 19 No Healthcare 4
## # … with 4 more variables: spending_score <chr>, family_size <dbl>,
## # var_1 <chr>, segmentation <chr>
#I couldn't figure the cross tab out.
# Convert every categorical column to a factor so the recipe steps and
# the classification model treat them correctly.
auto_segments <- auto_segments %>%
  mutate(across(c(gender, ever_married, graduated, profession,
                  spending_score, var_1, segmentation),
                factor))
# Reproducible 75/25 train/test split of the segmentation data.
set.seed(2021)
auto_segments_split <- initial_split(auto_segments, prop = 0.75)
auto_segments_train <- training(auto_segments_split)
auto_segments_test <- testing(auto_segments_split)
glimpse(auto_segments_train)
## Rows: 6,609
## Columns: 11
## $ id <dbl> 466315, 461735, 461319, 460156, 464347, 465015, 465176…
## $ gender <fct> Female, Male, Male, Male, Female, Female, Female, Fema…
## $ ever_married <fct> Yes, Yes, Yes, No, No, Yes, Yes, No, No, No, Yes, Yes,…
## $ age <dbl> 67, 67, 56, 32, 33, 61, 55, 26, 32, 31, 58, 79, 49, 18…
## $ graduated <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, No, No, Yes, Ye…
## $ profession <fct> Engineer, Lawyer, Artist, Healthcare, Healthcare, Engi…
## $ work_experience <dbl> 1, 0, 0, 1, 1, 0, 1, 1, 9, 1, 1, 0, 12, 3, 13, 1, 9, 1…
## $ spending_score <fct> Low, High, Average, Low, Low, Low, Average, Low, Low, …
## $ family_size <dbl> 1, 2, 2, 3, 3, 3, 4, 3, 5, 6, 4, 1, 1, 4, 2, 3, 8, 3, …
## $ var_1 <fct> Cat_6, Cat_6, Cat_6, Cat_6, Cat_6, Cat_7, Cat_6, Cat_6…
## $ segmentation <fct> B, B, C, C, D, D, C, A, D, B, B, C, A, D, A, C, A, C, …
# Preprocess: drop the id column, dummy-encode the two-level factors
# (reference coding) and the multi-level factors (one-hot), then center
# and scale the numeric predictors for distance-based k-NN.
auto_segments_recipe <-
  recipe(segmentation ~ ., data = auto_segments_train) %>%
  step_rm(id) %>%
  step_dummy(gender, ever_married, graduated, one_hot = FALSE) %>%
  step_dummy(spending_score, profession, var_1, one_hot = TRUE) %>%
  step_center(age, work_experience, family_size) %>%
  step_scale(age, work_experience, family_size) %>%
  prep()
auto_segments_juiced <- auto_segments_recipe %>% juice()
auto_segments_test_baked <- auto_segments_recipe %>%
  bake(new_data = auto_segments_test)
# Tune k over odd values 1..25 for the classification model: fit k-NN
# on the juiced training data and collect train/test accuracy and kappa.
begin_auto <- 1
for (i in seq(begin_auto, 25, 2)) {
  # FIX: the parsnip argument is `neighbors`; `neighbor` only worked via
  # partial argument matching.
  auto_knn_spec <- nearest_neighbor(neighbors = i) %>%
    set_engine("kknn") %>%
    set_mode("classification")
  auto_knn_fit <- auto_knn_spec %>%
    fit(segmentation ~ ., data = auto_segments_juiced)
  auto_train_knn_fit <- auto_knn_fit %>%
    predict(auto_segments_juiced) %>%
    bind_cols(auto_segments_juiced) %>%
    metrics(truth = segmentation, estimate = .pred_class) %>%
    mutate(k = i)
  auto_test_knn_fit <- auto_knn_fit %>%
    predict(auto_segments_test_baked) %>%
    bind_cols(auto_segments_test_baked) %>%
    metrics(truth = segmentation, estimate = .pred_class) %>%
    mutate(k = i)
  # FIX: the condition originally compared against `begin` (a variable
  # from the earlier regression loop), not `begin_auto`; it only worked
  # by coincidence because both happened to equal 1.
  if (i == begin_auto) {
    auto_summary_train_knn_fit <- auto_train_knn_fit
    auto_summary_test_knn_fit <- auto_test_knn_fit
  } else {
    auto_summary_train_knn_fit <- bind_rows(auto_summary_train_knn_fit, auto_train_knn_fit)
    auto_summary_test_knn_fit <- bind_rows(auto_summary_test_knn_fit, auto_test_knn_fit)
  }
}
print(auto_summary_train_knn_fit)
## # A tibble: 26 x 4
## .metric .estimator .estimate k
## <chr> <chr> <dbl> <dbl>
## 1 accuracy multiclass 0.936 1
## 2 kap multiclass 0.915 1
## 3 accuracy multiclass 0.939 3
## 4 kap multiclass 0.918 3
## 5 accuracy multiclass 0.938 5
## 6 kap multiclass 0.917 5
## 7 accuracy multiclass 0.846 7
## 8 kap multiclass 0.794 7
## 9 accuracy multiclass 0.781 9
## 10 kap multiclass 0.707 9
## # … with 16 more rows
print(auto_summary_test_knn_fit)
## # A tibble: 26 x 4
## .metric .estimator .estimate k
## <chr> <chr> <dbl> <dbl>
## 1 accuracy multiclass 0.380 1
## 2 kap multiclass 0.172 1
## 3 accuracy multiclass 0.380 3
## 4 kap multiclass 0.171 3
## 5 accuracy multiclass 0.380 5
## 6 kap multiclass 0.171 5
## 7 accuracy multiclass 0.409 7
## 8 kap multiclass 0.209 7
## 9 accuracy multiclass 0.429 9
## 10 kap multiclass 0.236 9
## # … with 16 more rows
What is the best k value? Why?
# Plot training-set accuracy for each candidate k.
auto_train_acc <- filter(auto_summary_train_knn_fit, .metric == "accuracy")
ggplot(auto_train_acc, aes(x = k, y = .estimate)) +
  geom_point() +
  geom_line() +
  labs(title = "Auto Segments Training Data Accuracy by k values",
       x = "k Values",
       y = "Accuracy")
# Plot testing-set accuracy for each candidate k.
auto_test_acc <- filter(auto_summary_test_knn_fit, .metric == "accuracy")
ggplot(auto_test_acc, aes(x = k, y = .estimate)) +
  geom_point() +
  geom_line() +
  labs(title = "Auto Segments Testing Data Accuracy by k values",
       x = "k Values",
       y = "Accuracy")
#apply_model <- tibble()
#predict(final_knn_fit, new_data = apply_model)
SORRY I couldn’t figure this part out, just wasn’t happening for me. I understand what the structure should be (as evidenced in part 2) but I couldn’t quite figure this out.
I relied on the labs and presentations A LOT to do all parts. Professor Ballenger was also INSTRUMENTAL in helping me figure out the knn problems.
I used this function to figure out how to get rid of missing data: https://datascienceplus.com/missing-values-in-r/#:~:text=In%20order%20to%20let%20R,you%20need%20to%20recode%20it.&text=Another%20useful%20function%20in%20R,()%20which%20delete%20incomplete%20observations.
I used this to understand the function n() better: https://www.rdocumentation.org/packages/dplyr/versions/0.7.8/topics/n
I used this to understand how to make density plots: https://www.youtube.com/watch?v=uLY8nMA7ejE
KNITTING ERROR: Date and Time - 4/15/21 around 11:00 PM Question: Hey Brennan I’m getting a weird knitting error, it keeps saying “lon” and “lat” not found. Any suggestions? Response: Have you tried commenting some things out? Person Asked: Brennan
“On my honor, I have neither given nor received any unacknowledged aid on this project.” -Ryan Brands 4/16/2021